import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import plotly.express as px
df = pd.read_csv(r"C:\Users\LALIT YADAV\Downloads\casestudy.csv", sep=',')
df = df.drop(df.columns[0], axis=1)
df['year']=df['year'].astype(str)
df['customer_email']=df['customer_email'].astype(str)
df['net_revenue']=df['net_revenue'].astype(int)
df.head()
| customer_email | net_revenue | year | |
|---|---|---|---|
| 0 | nhknapwsbx@gmail.com | 249 | 2015 |
| 1 | joiuzbvcpn@gmail.com | 87 | 2015 |
| 2 | ukkjctepxt@gmail.com | 168 | 2015 |
| 3 | gykatilzrt@gmail.com | 62 | 2015 |
| 4 | mmsgsrtxah@gmail.com | 43 | 2015 |
df.shape
(685927, 3)
df.isnull().any()
customer_email False net_revenue False year False dtype: bool
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 685927 entries, 0 to 685926 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_email 685927 non-null object 1 net_revenue 685927 non-null int32 2 year 685927 non-null object dtypes: int32(1), object(2) memory usage: 13.1+ MB
df.describe(include='all')
| customer_email | net_revenue | year | |
|---|---|---|---|
| count | 685927 | 685927.000000 | 685927 |
| unique | 604618 | NaN | 3 |
| top | qbndytimtr@gmail.com | NaN | 2017 |
| freq | 3 | NaN | 249987 |
| mean | NaN | 125.152837 | NaN |
| std | NaN | 71.853997 | NaN |
| min | NaN | 1.000000 | NaN |
| 25% | NaN | 63.000000 | NaN |
| 50% | NaN | 125.000000 | NaN |
| 75% | NaN | 187.000000 | NaN |
| max | NaN | 250.000000 | NaN |
Yearwise_Revenue = df.groupby(["year"])['net_revenue'].sum().reset_index()
Yearwise_Revenue.columns = ['Year','Total_Revenue']
Yearwise_Revenue['Total_Revenue']=Yearwise_Revenue['Total_Revenue']
display(Yearwise_Revenue)
| Year | Total_Revenue | |
|---|---|---|
| 0 | 2015 | 28922244 |
| 1 | 2016 | 25629598 |
| 2 | 2017 | 31293868 |
plt.figure(figsize=(12,5))
px.bar(data_frame=Yearwise_Revenue, x = 'Year', y = 'Total_Revenue',title="Yearwise Revenue")
<Figure size 864x360 with 0 Axes>
TY = df[df['year']=='2017'] #Considering most recent year
LY = df[df['year']=='2016'] #Considering most last year
LY_2 = df[df['year']=='2015'] #Considering most second last year
#Cheching Data
display(TY.head(),LY.head(),LY_2.head())
| customer_email | net_revenue | year | |
|---|---|---|---|
| 435940 | mwrossuukz@gmail.com | 96 | 2017 |
| 435941 | qvjfterwnk@gmail.com | 176 | 2017 |
| 435942 | vlyigtgfzs@gmail.com | 73 | 2017 |
| 435943 | yfirychuhk@gmail.com | 138 | 2017 |
| 435944 | fdkiioqtli@gmail.com | 156 | 2017 |
| customer_email | net_revenue | year | |
|---|---|---|---|
| 231294 | mwrossuukz@gmail.com | 197 | 2016 |
| 231295 | gkwsoupawk@gmail.com | 38 | 2016 |
| 231296 | vlyigtgfzs@gmail.com | 154 | 2016 |
| 231297 | yfirychuhk@gmail.com | 102 | 2016 |
| 231298 | trnzgqinuu@gmail.com | 32 | 2016 |
| customer_email | net_revenue | year | |
|---|---|---|---|
| 0 | nhknapwsbx@gmail.com | 249 | 2015 |
| 1 | joiuzbvcpn@gmail.com | 87 | 2015 |
| 2 | ukkjctepxt@gmail.com | 168 | 2015 |
| 3 | gykatilzrt@gmail.com | 62 | 2015 |
| 4 | mmsgsrtxah@gmail.com | 43 | 2015 |
merged = pd.merge(TY, LY, how='left',
on='customer_email').fillna("NA")
#merged = merged.drop(['year','net_revenue'], axis = 1) # drop duplicate info
merged = merged.rename(columns={"net_revenue_x": "net_revenue_TY",
"year_x": "TY", "net_revenue_y": "net_revenue_LY", "year_y": "LY"})
merged.head()
| customer_email | net_revenue_TY | TY | net_revenue_LY | LY | |
|---|---|---|---|---|---|
| 0 | mwrossuukz@gmail.com | 96 | 2017 | NA | NA |
| 1 | qvjfterwnk@gmail.com | 176 | 2017 | NA | NA |
| 2 | vlyigtgfzs@gmail.com | 73 | 2017 | NA | NA |
| 3 | yfirychuhk@gmail.com | 138 | 2017 | NA | NA |
| 4 | fdkiioqtli@gmail.com | 156 | 2017 | NA | NA |
New_Cust = merged[merged['net_revenue_LY']=='NA']
#New_Cust = New_Cust.drop(['LY','net_revenue_LY'], axis = 1) # drop duplicate info
New_Customer_Revenue=New_Cust['net_revenue_TY'].sum()
print('New Customer Revenue:',New_Cust['net_revenue_TY'].sum())
New Customer Revenue: 28663005
Existing_Cust = merged[merged['net_revenue_LY']!='NA']
Existing_Customer_Growth=New_Cust['net_revenue_TY'].sum()-Existing_Cust['net_revenue_LY'].sum()
print('Existing Customer Growth:',New_Cust['net_revenue_TY'].sum()-Existing_Cust['net_revenue_LY'].sum())
Existing Customer Growth: 26052730.0
merged = pd.merge(TY, LY, how='right',
on='customer_email').fillna("NA")
#merged = merged.drop(['year','net_revenue'], axis = 1) # drop duplicate info
merged = merged.rename(columns={"net_revenue_x": "net_revenue_TY",
"year_x": "TY", "net_revenue_y": "net_revenue_LY", "year_y": "LY"})
merged.head()
| customer_email | net_revenue_TY | TY | net_revenue_LY | LY | |
|---|---|---|---|---|---|
| 0 | mwrossuukz@gmail.com | NA | NA | 197 | 2016 |
| 1 | gkwsoupawk@gmail.com | NA | NA | 38 | 2016 |
| 2 | vlyigtgfzs@gmail.com | NA | NA | 154 | 2016 |
| 3 | yfirychuhk@gmail.com | NA | NA | 102 | 2016 |
| 4 | trnzgqinuu@gmail.com | NA | NA | 32 | 2016 |
Lost_Cust = merged[merged['net_revenue_TY']=='NA']
Revenue_lost_from_attrition=Lost_Cust['net_revenue_LY'].sum()
print('Revenue lost from attrition: ',Revenue_lost_from_attrition)
Revenue lost from attrition: 23019323
Existing_Customer_Revenue_Current_Year=Existing_Cust['net_revenue_TY'].sum()
print('Existing Customer Revenue Current Year:',Existing_Customer_Revenue_Current_Year)
Existing Customer Revenue Current Year: 2630863
Existing_Customer_Revenue_Last_Year=Existing_Cust['net_revenue_LY'].sum()
print('Existing Customer Revenue Last Year:',Existing_Customer_Revenue_Last_Year)
Existing Customer Revenue Last Year: 2610275.0
Yearwise_Cust_Count = df.groupby(["year"])['customer_email'].count().reset_index()
Yearwise_Cust_Count
| year | customer_email | |
|---|---|---|
| 0 | 2015 | 231294 |
| 1 | 2016 | 204646 |
| 2 | 2017 | 249987 |
plt.figure(figsize=(12,5))
px.bar(data_frame=Yearwise_Cust_Count, x = 'year', y = 'customer_email',title="Yearwise Customer Count")
<Figure size 864x360 with 0 Axes>
New_Cust_Count = New_Cust['customer_email'].count()
Lost_Cust_Count = Lost_Cust['customer_email'].count()
d =pd.DataFrame({'Cust_Type': ['New', 'Lost'], 'Customer_Count': [New_Cust_Count, Lost_Cust_Count]})
display(d)
| Cust_Type | Customer_Count | |
|---|---|---|
| 0 | New | 229028 |
| 1 | Lost | 183687 |
plt.figure(figsize=(12,5))
px.bar(data_frame=d, x = 'Cust_Type', y = 'Customer_Count',title="New vs Lost Customers")
<Figure size 864x360 with 0 Axes>